林嶔 (Lin, Chin)
Lesson 21
– Recall what we said at the start of the second half of the semester: if we could describe the true relationship between X and Y clearly, we could predict with precision. The difficulty right now is that we simply cannot spell out that logic.
– The structure of a nerve cell is as follows. Regardless of the type of neuron, it can be divided into a receptive zone, a trigger zone, a conducting zone, and an output zone.
Through its dendrites a neuron receives signals from the previous neuron; some of these inputs are inhibitory and some are excitatory. The neuron then integrates these signals and transmits the result onward through its axon.
Based on this bit of biology, let us use the computer to simulate a simple neuron.
perceptron_v1 = function (x1, x2, w0, w1, w2) {
  weighted.sum = w0 + x1 * w1 + x2 * w2  # integrate the inputs as a weighted sum
  return(weighted.sum > 0)               # fire (TRUE) only when the sum exceeds the threshold of 0
}
perceptron_v1(x1 = 1, x2 = 3, w0 = -1, w1 = 2, w2 = -1)
## [1] FALSE
perceptron_v1(x1 = 1, x2 = -2, w0 = -1, w1 = 2, w2 = -1)
## [1] TRUE
– Since this is just a simplified version of logistic regression, there is not much more to say: we already know the limits of logistic regression very well.
perceptron_v2 = function (x1, x2, w0, w1, w2) {
  weighted.sum = w0 + x1 * w1 + x2 * w2
  prop = 1/(1+exp(-weighted.sum))   # sigmoid: map the weighted sum to a probability between 0 and 1
  return(prop)
}
perceptron_v2(x1 = 1, x2 = 3, w0 = -1, w1 = 2, w2 = -1)
## [1] 0.1192029
perceptron_v2(x1 = 1, x2 = -2, w0 = -1, w1 = 2, w2 = -1)
## [1] 0.9525741
set.seed(0)
x1 = rnorm(1000)
x2 = rnorm(1000)
lr1 = x1^2 + x2^2
p1 = 1/(1+exp(-lr1))
y1 = p1 > mean(p1)
plot(x1, x2, col = (y1 + 1)*2, pch = 19)
– Now let us fit the parameters, using maximum likelihood estimation with the sample likelihood as the criterion.
library(stats4)
Accuracy.y1 = function (w0, w1, w2) {
  pred.y1 = perceptron_v2(x1 = x1, x2 = x2, w0 = w0, w1 = w1, w2 = w2)
  lr = (log(pred.y1)*y1 + log(1-pred.y1)*(1-y1))  # log-likelihood of each observation
  return(-sum(lr))                                # mle() minimises, so return the negative log-likelihood
}
fit1 = mle(Accuracy.y1, start = list(w0 = 0, w1 = 0, w2 = 0), method = "SANN")
pred.y1 = perceptron_v2(x1 = x1, x2 = x2,
w0 = fit1@coef[1], w1 = fit1@coef[2], w2 = fit1@coef[3])
tab1 = table(pred.y1>0.5, y1)
print(tab1)
## y1
## FALSE TRUE
## FALSE 0 17
## TRUE 465 518
cat("Accuracy (Perceptron) = ", sum(diag(tab1))/sum(tab1))
## Accuracy (Perceptron) = 0.518
– Now the interesting part: after all, the brain is not made of a single neuron. What if we stack several neurons together? (Put differently, what if we combine several logistic regressions?)
perceptron_v2 = function (x1, x2, w0, w1, w2) {
weighted.sum = w0 + x1 * w1 + x2 * w2
prop = 1/(1+exp(-weighted.sum))
return(prop)
}
mynet = function (x1, x2, w01, w11, w21, w02, w12, w22, z0, z1, z2) {
h1 = perceptron_v2(x1 = x1, x2 = x2, w0 = w01, w1 = w11, w2 = w21)
h2 = perceptron_v2(x1 = x1, x2 = x2, w0 = w02, w1 = w12, w2 = w22)
o1 = perceptron_v2(x1 = h1, x2 = h2, w0 = z0, w1 = z1, w2 = z2)
return(o1)
}
mynet(x1 = 0, x2 = 1,
w01 = 0.1, w11 = 0.2, w21 = 0.3,
w02 = 0.4, w12 = 0.5, w22 = 0.6,
z0 = 0.7, z1 = 0.8, z2 = 0.9)
## [1] 0.862582
Accuracy_mynet.y1 = function (w01, w11, w21, w02, w12, w22, z0, z1, z2) {
pred.y1 = mynet(x1 = x1, x2 = x2,
w01 = w01, w11 = w11, w21 = w21,
w02 = w02, w12 = w12, w22 = w22,
z0 = z0, z1 = z1, z2 = z2)
lr = (log(pred.y1)*y1 + log(1-pred.y1)*(1-y1))
return(-sum(lr))
}
fit3 = mle(Accuracy_mynet.y1, start = list(w01 = 0, w11 = 0, w21 = 0, w02 = 0, w12 = 0, w22 = 0, z0 = 0, z1 = 0, z2 = 0), method = "SANN")
print(fit3)
##
## Call:
## mle(minuslogl = Accuracy_mynet.y1, start = list(w01 = 0, w11 = 0,
## w21 = 0, w02 = 0, w12 = 0, w22 = 0, z0 = 0, z1 = 0, z2 = 0),
## method = "SANN")
##
## Coefficients:
## w01 w11 w21 w02 w12 w22
## 10.1333278 2.6968961 -7.9747142 7.3321474 -0.9534551 6.3121327
## z0 z1 z2
## 14.3265385 -8.1730799 -7.1797248
pred.y1 = mynet(x1 = x1, x2 = x2,
w01 = fit3@coef[1], w11 = fit3@coef[2], w21 = fit3@coef[3],
w02 = fit3@coef[4], w12 = fit3@coef[5], w22 = fit3@coef[6],
z0 = fit3@coef[7], z1 = fit3@coef[8], z2 = fit3@coef[9])
tab2 = table(pred.y1>0.5, y1)
print(tab2)
## y1
## FALSE TRUE
## FALSE 432 181
## TRUE 33 354
cat("Accuracy (Neural Network) = ", sum(diag(tab2))/sum(tab2))
## Accuracy (Neural Network) = 0.786
– Let us put this in mathematical terms: assuming there exists some unknown function of X that predicts Y perfectly, a neural network has the capacity to approximate any such arbitrarily complex function.
– Doesn't that make it just like our own brain? Even though we do not know how handwritten digits are recognised (we cannot state the logic), we are still able to recognise handwritten digits.
– Because this structure is a combination of several perceptrons, it is usually called a "multilayer perceptron" to distinguish it by name from the neural networks that come later.
– Note, however, that MxNet can only be installed on 64-bit operating systems.
– Its installation procedure is a little unusual, and there is also a way to install a GPU version; the commands below install the CPU version:
cran <- getOption("repos")
cran["dmlc"] <- "https://s3-us-west-2.amazonaws.com/apache-mxnet/R/CRAN/"
options(repos = cran)
install.packages("mxnet")
– Let us first review the structure of the data once more.
DAT = read.csv("data/train.csv")
DAT = data.matrix(DAT)
#Split data
set.seed(0)
Train.sample = sample(1:nrow(DAT), nrow(DAT)*0.6, replace = FALSE)
Train.X = DAT[Train.sample,-1]/255
Train.Y = DAT[Train.sample,1]
Test.X = DAT[-Train.sample,-1]/255
Test.Y = DAT[-Train.sample,1]
#Display
library(imager)
par(mar=rep(0,4), mfcol = c(4, 4))
for (i in 1:16) {
plot(NA, xlim = 0:1, ylim = 0:1, xaxt = "n", yaxt = "n", bty = "n")
img = as.raster(t(matrix(as.numeric(Train.X[i,]), nrow = 28)))
rasterImage(img, -0.04, -0.04, 1.04, 1.04, interpolate=FALSE)
text(0.05, 0.95, Train.Y[i], col = "green", cex = 2)
}
– First, define the neural network.
library(mxnet)
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
act1 <- mx.symbol.Activation(fc1, name="sigmoid1", act_type="sigmoid")
fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
act2 <- mx.symbol.Activation(fc2, name="sigmoid2", act_type="sigmoid")
fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
– The 784 raw features (28 × 28 pixels) serve as the input neurons (input layer) and feed into 128 neurons (hidden layer 1).
– After the weighted sums are computed, a sigmoid transformation is applied.
– The 128 outputs of hidden layer 1 act as second-order features and feed into 64 neurons (hidden layer 2).
– After the weighted sums are computed, a sigmoid transformation is applied.
– The 64 outputs of hidden layer 2 act as third-order features and feed into 10 neurons (output layer).
– The softmax function then produces the prediction. (A quick parameter count for this architecture is sketched below.)
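To get a feel for the model's size, we can count the weights and biases implied by the layer sizes listed above. This is a minimal arithmetic sketch; the helper layer_params is ours, not part of MxNet.
layer_params = function (n_in, n_out) {n_in * n_out + n_out}  # weights + biases of one fully connected layer
p1 = layer_params(784, 128)  # fc1: input layer  -> hidden layer 1
p2 = layer_params(128, 64)   # fc2: hidden layer 1 -> hidden layer 2
p3 = layer_params(64, 10)    # fc3: hidden layer 2 -> output layer
c(fc1 = p1, fc2 = p2, fc3 = p3, total = p1 + p2 + p3)
# fc1 = 100480, fc2 = 8256, fc3 = 650, total = 109386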
– Start training (20 rounds for now).
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = Train.X, y = Train.Y,
ctx = mx.cpu(), num.round = 20, array.batch.size = 100,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100))
prop.y = predict(model, Test.X[1:2,])
round(prop.y, 3)
## [,1] [,2]
## [1,] 0.993 0.000
## [2,] 0.000 0.999
## [3,] 0.000 0.000
## [4,] 0.000 0.000
## [5,] 0.000 0.000
## [6,] 0.004 0.000
## [7,] 0.001 0.000
## [8,] 0.000 0.000
## [9,] 0.000 0.000
## [10,] 0.001 0.000
According to this prediction, the first digit has a 99.3% probability of being a 0, and the second digit has a 99.9% probability of being a 1.
Let us reproduce the computation by hand:
params = model$arg.params
Input = matrix(Test.X[1,], nrow = 1) # 1x784
Weight_1 = as.matrix(as.array(params$fc1_weight)) #784x128
Bias_1 = t(as.matrix(as.array(params$fc1_bias))) #1x128
Hidden_1 = Input %*% Weight_1 + Bias_1 # 1x128
Sigmoid_1 = 1/(1+exp(-Hidden_1)) # 1x128
Weight_2 = as.matrix(as.array(params$fc2_weight)) #128x64
Bias_2 = t(as.matrix(as.array(params$fc2_bias))) #1x64
Hidden_2 = Sigmoid_1 %*% Weight_2 + Bias_2 # 1x64
Sigmoid_2 = 1/(1+exp(-Hidden_2)) # 1x64
Weight_3 = as.matrix(as.array(params$fc3_weight)) #64x10
Bias_3 = t(as.matrix(as.array(params$fc3_bias))) #1x10
Output = Sigmoid_2 %*% Weight_3 + Bias_3 # 1x10
Softmax.Output = exp(Output)/sum(exp(Output)) # 1x10
round(Softmax.Output, 3)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 0.993 0 0 0 0 0.004 0.001 0 0 0.001
– The "ReLU" function is very simple: every negative value becomes 0 and positive values are left unchanged, i.e. ReLU(x) = max(0, x).
– In R it can be implemented like this:
x = rnorm(10)
print(x)
## [1] -1.1013901 0.3487389 0.4929406 0.6458858 0.2589455 -0.7041940
## [7] -1.1990471 -0.2599200 -0.1832568 -0.3809851
relu.x = x
relu.x[relu.x < 0] = 0
relu.x
## [1] 0.0000000 0.3487389 0.4929406 0.6458858 0.2589455 0.0000000 0.0000000
## [8] 0.0000000 0.0000000 0.0000000
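As a side note, the same relu.x could be computed in one line with base R's pmax(), which takes elementwise maxima:
pmax(x, 0)  # elementwise maximum of x and 0; identical to relu.x above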
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = Train.X, y = Train.Y,
ctx = mx.cpu(), num.round = 20, array.batch.size = 100,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100))
preds = predict(model, Test.X)
pred.label = max.col(t(preds)) - 1
tab = table(pred.label, Test.Y)
cat("Testing accuracy rate =", sum(diag(tab))/sum(tab))
## Testing accuracy rate = 0.973869
print(tab)
## Test.Y
## pred.label 0 1 2 3 4 5 6 7 8 9
## 0 1642 0 6 1 4 4 6 1 1 7
## 1 0 1829 9 3 3 3 1 3 10 2
## 2 4 5 1614 11 3 4 5 12 7 3
## 3 2 1 9 1690 2 20 1 2 16 15
## 4 0 1 3 0 1562 2 8 4 1 14
## 5 2 1 0 17 0 1486 4 1 8 5
## 6 1 2 1 0 4 6 1633 0 4 1
## 7 1 2 8 1 4 0 0 1714 1 18
## 8 9 8 6 13 2 13 3 6 1622 8
## 9 2 2 0 6 22 13 0 10 5 1569
prop.y = predict(model, Test.X[1:2,])
round(prop.y, 3)
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
## [3,] 0 0
## [4,] 0 0
## [5,] 0 0
## [6,] 0 0
## [7,] 0 0
## [8,] 0 0
## [9,] 0 0
## [10,] 0 0
– Of course there is: it is called "overfitting".
– This problem is, at heart, another "local optimum" problem. Our sample inevitably contains sampling error, and the network may learn to classify by memorising that error, reaching 100% accuracy on the training set while accuracy on the test set is abnormally low.
– In addition, this time we also want to evaluate the accuracy on the test set while training.
sub.Train.X = Train.X[1:1000,]
sub.Train.Y = Train.Y[1:1000]
sub.Test.X = Test.X[1:500,]
sub.Test.Y = Test.Y[1:500]
logger = mx.metric.logger$new()
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = sub.Train.X, y = sub.Train.Y,
eval.data = list(data = sub.Test.X, label = sub.Test.Y),
ctx = mx.cpu(), num.round = 100, array.batch.size = 30,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100, logger))
plot(1:100, logger$train, xlab = "epoch", ylab = "accuracy", type = "l", col = "red")
lines(1:100, logger$eval, col = "blue")
legend("bottomright", c("Training", "Testing"), col = c("red", "blue"), lwd = 1)
– Moreover, the test set actually reached its highest accuracy at around epoch 23, and afterwards it even declined slightly.
– So let me ask you: how would you solve this problem?
– Try drawing 10 random sub-samples, training 10 separate neural networks on them, and letting them vote; see how the result turns out.
sub.Train.X = Train.X[1:1000,]
sub.Train.Y = Train.Y[1:1000]
sub.Test.X = Test.X[1:500,]
sub.Test.Y = Test.Y[1:500]
set.seed(0)
model.list = list()
pred.list = list()
for (i in 1:10) {
sub.sample = sample(1:1000, 700)
sample.Train.X = sub.Train.X[sub.sample,]
sample.Train.Y = sub.Train.Y[sub.sample]
model.list[[i]] = mx.model.FeedForward.create(softmax, X = sample.Train.X, y = sample.Train.Y,
ctx = mx.cpu(), num.round = 50,
array.batch.size = 30,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100))
pred.list[[i+1]] = predict(model.list[[i]], sub.Test.X)
if (i==1) {pred.list[[1]] = pred.list[[i+1]]} else {pred.list[[1]] = pred.list[[1]] + pred.list[[i+1]]}
}
pred.list[[1]] = pred.list[[1]]/10
pred.label = max.col(t(pred.list[[1]])) - 1
tab = table(pred.label, sub.Test.Y)
cat("Testing accuracy rate =", sum(diag(tab))/sum(tab))
## Testing accuracy rate = 0.908
print(tab)
## sub.Test.Y
## pred.label 0 1 2 3 4 5 6 7 8 9
## 0 45 0 0 0 0 0 0 0 0 0
## 1 0 57 2 0 0 0 0 1 1 0
## 2 0 0 53 0 0 0 0 0 0 0
## 3 0 0 0 43 0 3 0 0 3 0
## 4 0 0 1 0 47 0 0 0 0 1
## 5 0 0 0 0 0 31 0 0 6 1
## 6 0 0 1 0 2 2 45 1 0 0
## 7 0 0 2 0 0 0 0 49 1 1
## 8 0 4 1 0 1 0 0 0 42 0
## 9 0 0 0 0 7 1 0 0 3 42
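– Another common remedy is weight decay (L2 regularization), which penalises large weights during training. In mx.model.FeedForward.create this is controlled by the wd argument; the run below repeats the earlier small-sample experiment with wd = 0.001.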
sub.Train.X = Train.X[1:1000,]
sub.Train.Y = Train.Y[1:1000]
sub.Test.X = Test.X[1:500,]
sub.Test.Y = Test.Y[1:500]
logger = mx.metric.logger$new()
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = sub.Train.X, y = sub.Train.Y,
eval.data = list(data = sub.Test.X, label = sub.Test.Y),
ctx = mx.cpu(), num.round = 100, array.batch.size = 30,
learning.rate = 0.05, momentum = 0.9, wd = 0.001,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100, logger))
plot(1:100, logger$train, xlab = "epoch", ylab = "accuracy", type = "l", col = "red")
lines(1:100, logger$eval, col = "blue")
legend("bottomright", c("Training", "Testing"), col = c("red", "blue"), lwd = 1)
sub.Train.X = Train.X[1:800,]
sub.Train.Y = Train.Y[1:800]
sub.Vald.X = Train.X[801:1000,]
sub.Vald.Y = Train.Y[801:1000]
sub.Test.X = Test.X[1:500,]
sub.Test.Y = Test.Y[1:500]
mx.callback.early.stop <- function(period, logger = NULL, small.value = "bad", tolerance = 1e-4) {
function(iteration, nbatch, env, verbose) {
if (nbatch %% period == 0 && !is.null(env$metric)) {
result <- env$metric$get(env$train.metric)
if (nbatch != 0) {
if(verbose) {cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n"))}
}
if (!is.null(logger)) {
if (class(logger) != "mx.metric.logger") {
stop("Invalid mx.metric.logger.")
} else {
logger$train <- c(logger$train, result$value)
if (!is.null(env$eval.metric)) {
result <- env$metric$get(env$eval.metric)
if (nbatch != 0) {cat(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value, "\n"))}
logger$eval <- c(logger$eval, result$value)
}
}
}
}
if (!is.null(env$metric)) {
  # Early-stopping rule: wait for at least 10 logged epochs, then compare the mean of the
  # last two metric values with the most recent one (scaled by 'tolerance').
  # Returning FALSE tells mx.model.FeedForward.create to stop training early.
  if (length(logger$train) >= 10) {
    if (!is.null(env$eval.metric)) {TEST.VALUE = round(logger$eval/tolerance)} else {TEST.VALUE = round(logger$train/tolerance)}
    if (small.value=="good") {
      # Smaller values are better (e.g. a loss): stop once the metric has stopped decreasing.
      if (mean(tail(TEST.VALUE, 2)) <= mean(tail(TEST.VALUE, 1))) {return(FALSE)}
    } else {
      # Larger values are better (e.g. accuracy): stop once the metric has stopped increasing.
      if (mean(tail(TEST.VALUE, 2)) >= mean(tail(TEST.VALUE, 1))) {return(FALSE)}
    }
  }
}
return(TRUE)
}
}
logger = mx.metric.logger$new()
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = sub.Train.X, y = sub.Train.Y,
eval.data = list(data = sub.Vald.X, label = sub.Vald.Y),
ctx = mx.cpu(), num.round = 100, array.batch.size = 30,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.early.stop(100, logger))
preds = predict(model, sub.Test.X)
pred.label = max.col(t(preds)) - 1
tab = table(pred.label, sub.Test.Y)
cat("Testing accuracy rate =", sum(diag(tab))/sum(tab))
## Testing accuracy rate = 0.804
print(tab)
## sub.Test.Y
## pred.label 0 1 2 3 4 5 6 7 8 9
## 0 38 0 0 0 0 0 0 0 0 0
## 1 0 58 2 2 0 1 0 2 6 1
## 2 0 0 47 0 0 0 0 0 0 0
## 3 0 0 0 37 0 1 0 0 0 1
## 4 1 0 1 0 43 0 4 1 1 2
## 5 3 0 0 1 0 25 1 0 5 0
## 6 2 0 5 0 1 3 40 0 0 0
## 7 0 0 2 1 0 0 0 48 0 14
## 8 0 3 3 2 0 3 0 0 39 0
## 9 1 0 0 0 13 4 0 0 5 27
– This method is a bit like having a whole class of students take the same exam over and over: after a while a few students can do everything and the rest start slacking off. So for each exam we pull out half of the students at random, and then every student has to learn how to take the exam.
– This method needs a little more training time, so we raise the number of rounds to 200.
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
drop1 <- mx.symbol.Dropout(act1, p = 0.5)
fc2 <- mx.symbol.FullyConnected(drop1, name="fc2", num_hidden=64)
act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
drop2 <- mx.symbol.Dropout(act2, p = 0.5)
fc3 <- mx.symbol.FullyConnected(drop2, name="fc3", num_hidden=10)
softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
sub.Train.X = Train.X[1:1000,]
sub.Train.Y = Train.Y[1:1000]
sub.Test.X = Test.X[1:500,]
sub.Test.Y = Test.Y[1:500]
logger = mx.metric.logger$new()
mx.set.seed(0)
model = mx.model.FeedForward.create(softmax, X = sub.Train.X, y = sub.Train.Y,
eval.data = list(data = sub.Test.X, label = sub.Test.Y),
ctx = mx.cpu(), num.round = 200, array.batch.size = 30,
learning.rate = 0.05, momentum = 0.9,
eval.metric = mx.metric.accuracy,
epoch.end.callback = mx.callback.log.train.metric(100, logger))
plot(1:200, logger$train, xlab = "epoch", ylab = "accuracy", type = "l", col = "red")
lines(1:200, logger$eval, col = "blue")
legend("bottomright", c("Training", "Testing"), col = c("red", "blue"), lwd = 1)
– Next week we will hold a competition to see whose neural network predicts most accurately!
– For training you may only use the training set obtained from the split with seed 0.
– You may check your accuracy on the test set at any time.
DAT = read.csv("data/train.csv")
DAT = data.matrix(DAT)
#Split data
set.seed(0)
Train.sample = sample(1:nrow(DAT), nrow(DAT)*0.6, replace = FALSE)
Train.X = DAT[Train.sample,-1]/255
Train.Y = DAT[Train.sample,1]
Test.X = DAT[-Train.sample,-1]/255
Test.Y = DAT[-Train.sample,1]
– When we used the function mle to fit parameters earlier, it was actually running simulated annealing (method = "SANN"), which is very similar to MCMC, so its efficiency is necessarily very poor; with the weak computing power of the 1970s it was very hard to compute a neural "network" this way.
– In 1986, David Rumelhart, Geoffrey Hinton, and Ronald Williams applied "backpropagation" to estimate the parameters. In essence it is simply the chain rule of differentiation combined with gradient descent, and this algorithm finally brought the computation within reach of the computers of the time (a minimal sketch appears after this list).
– But people soon discovered that "deep neural networks" were hard to train, and in the same period powerful statistical methods such as SVM and random forests appeared, so neural networks fell into disrepute.
– In 2006, Geoffrey Hinton developed the "restricted Boltzmann machine" for training deep neural networks. This addressed the training-time problem, but the computation was still slow compared with SVM and random forests and the accuracy gain was not dramatic; at that point neural networks needed a stage on which to be re-introduced to the world.
– In 2012, Geoffrey Hinton and his student Alex Krizhevsky entered the ILSVRC computer image-recognition competition and won by an overwhelming margin; from then on neural networks received serious attention again.
– In 2016, a neural network developed by Kaiming He and a Microsoft team officially surpassed the limits of human experts on ILSVRC, heralding the return of the neural-network era.
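To make "chain rule + gradient descent" concrete, here is the minimal backpropagation sketch referred to above. It fits the same nine-parameter network as mynet on the simulated x1, x2, y1 data; it is an illustration only, and the learning rate eta, the iteration count n_iter, and the random starting weights are arbitrary choices for this example.
sigmoid = function (z) {1/(1+exp(-z))}
set.seed(1)
W = rnorm(9, sd = 0.1)   # nine weights, initialised at small random values
names(W) = c("w01", "w11", "w21", "w02", "w12", "w22", "z0", "z1", "z2")
eta = 0.05     # learning rate
n_iter = 2000  # number of gradient-descent steps
for (iter in 1:n_iter) {
  # Forward pass (identical to mynet)
  h1 = sigmoid(W["w01"] + x1*W["w11"] + x2*W["w21"])
  h2 = sigmoid(W["w02"] + x1*W["w12"] + x2*W["w22"])
  o1 = sigmoid(W["z0"] + h1*W["z1"] + h2*W["z2"])
  # Backward pass: chain rule applied to the negative log-likelihood
  d_o = o1 - y1                        # gradient w.r.t. the output neuron's weighted sum
  d_h1 = d_o * W["z1"] * h1 * (1-h1)   # gradient w.r.t. hidden neuron 1's weighted sum
  d_h2 = d_o * W["z2"] * h2 * (1-h2)   # gradient w.r.t. hidden neuron 2's weighted sum
  grad = c(sum(d_h1), sum(d_h1*x1), sum(d_h1*x2),
           sum(d_h2), sum(d_h2*x1), sum(d_h2*x2),
           sum(d_o), sum(d_o*h1), sum(d_o*h2))
  W = W - eta * grad / length(y1)      # gradient-descent update, averaged over the sample
}
round(W, 2)  # the fitted weights can be compared with fit3@coef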